{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "import numpy as np\n",
    "import csv\n",
    "from tqdm import tqdm\n",
    "import pandas as pd\n",
    "import random\n",
    "import pickle\n",
    "\n",
    "root = '/data/private/Ad/amazon/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(root+'Electronics_5.json') as fin:\n",
    "    df = {}\n",
    "    for i, line in enumerate(fin):\n",
    "        df[i] = eval(line)\n",
    "    reviews_df = pd.DataFrame.from_dict(df, orient='index')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(root+'np_prepro/reviews.pkl', 'wb') as f:\n",
    "    pickle.dump(reviews_df, f, pickle.HIGHEST_PROTOCOL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(root+'meta_Electronics.json') as fin:\n",
    "    df = {}\n",
    "    for i, line in enumerate(fin):\n",
    "        df[i] = eval(line)\n",
    "    meta_df = pd.DataFrame.from_dict(df, orient='index')\n",
    "\n",
    "meta_df = meta_df[meta_df['asin'].isin(reviews_df['asin'].unique())]\n",
    "meta_df = meta_df.reset_index(drop=True)\n",
    "with open(root+'np_prepro/meta.pkl', 'wb') as f:\n",
    "    pickle.dump(meta_df, f, pickle.HIGHEST_PROTOCOL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "reviews_df = reviews_df[['reviewerID', 'asin', 'unixReviewTime']]\n",
    "meta_df = meta_df[['asin', 'categories']]\n",
    "# only one category...\n",
    "meta_df['categories'] = meta_df['categories'].map(lambda x: x[-1][-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_map(df, col_name):\n",
    "    key = sorted(df[col_name].unique().tolist())\n",
    "    m = dict(zip(key, range(len(key))))\n",
    "    df[col_name] = df[col_name].map(lambda x: m[x])\n",
    "    return m, key"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "asin_map, asin_key = build_map(meta_df, 'asin')\n",
    "cate_map, cate_key = build_map(meta_df, 'categories')\n",
    "revi_map, revi_key = build_map(reviews_df, 'reviewerID')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "user_count: 192403\titem_count: 63001\tcate_count: 801\texample_count: 1689188\n"
     ]
    }
   ],
   "source": [
    "user_count, item_count, cate_count, example_count =\\\n",
    "    len(revi_map), len(asin_map), len(cate_map), reviews_df.shape[0]\n",
    "print('user_count: %d\\titem_count: %d\\tcate_count: %d\\texample_count: %d' %\n",
    "      (user_count, item_count, cate_count, example_count))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "meta_df = meta_df.sort_values('asin')\n",
    "meta_df = meta_df.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "         reviewerID   asin  unixReviewTime\n",
      "0                 0  13179      1400457600\n",
      "1                 0  17993      1400457600\n",
      "2                 0  28326      1400457600\n",
      "3                 0  29247      1400457600\n",
      "4                 0  62275      1400457600\n",
      "5                 1  58134      1379548800\n",
      "6                 1  62555      1379548800\n",
      "7                 1  41862      1384041600\n",
      "8                 1  46010      1385769600\n",
      "9                 1  54171      1385769600\n",
      "10                1  56540      1385769600\n",
      "11                2  42298      1366156800\n",
      "12                2  46782      1366156800\n",
      "13                2  50682      1366156800\n",
      "14                2  42390      1370563200\n",
      "15                2  47355      1370563200\n",
      "16                3  25578      1371772800\n",
      "17                3  21989      1375142400\n",
      "18                3  58444      1402876800\n",
      "19                3  60072      1402876800\n",
      "20                3  62274      1402876800\n",
      "21                4  54245      1359331200\n",
      "22                4   3112      1361145600\n",
      "23                4  40094      1361145600\n",
      "24                4  48963      1361145600\n",
      "25                4  30275      1389744000\n",
      "26                4  58671      1402358400\n",
      "27                4  62022      1402358400\n",
      "28                5  30462      1373241600\n",
      "29                5  55698      1373241600\n",
      "...             ...    ...             ...\n",
      "1689158      192402  36004      1357776000\n",
      "1689159      192402  37977      1357776000\n",
      "1689160      192402  39411      1357776000\n",
      "1689161      192402   7681      1358035200\n",
      "1689162      192402  18186      1358035200\n",
      "1689163      192402  27522      1358035200\n",
      "1689164      192402  29206      1358035200\n",
      "1689165      192402  30547      1358035200\n",
      "1689166      192402  42076      1358035200\n",
      "1689167      192402  29518      1377907200\n",
      "1689168      192402  35691      1377907200\n",
      "1689169      192402  44123      1377907200\n",
      "1689170      192402  54615      1377907200\n",
      "1689171      192402  45465      1385856000\n",
      "1689172      192402  48862      1385856000\n",
      "1689173      192402  52445      1385856000\n",
      "1689174      192402  60275      1385856000\n",
      "1689175      192402  51004      1386633600\n",
      "1689176      192402  53509      1386633600\n",
      "1689177      192402  61519      1386633600\n",
      "1689178      192402  28581      1388534400\n",
      "1689179      192402  29369      1388534400\n",
      "1689180      192402  41590      1388534400\n",
      "1689181      192402  51306      1388534400\n",
      "1689182      192402  49816      1389744000\n",
      "1689183      192402  57576      1389744000\n",
      "1689184      192402  22519      1396396800\n",
      "1689185      192402  20977      1404172800\n",
      "1689186      192402  60283      1404172800\n",
      "1689187      192402  62677      1405123200\n",
      "\n",
      "[1689188 rows x 3 columns]\n"
     ]
    }
   ],
   "source": [
    "reviews_df['asin'] = reviews_df['asin'].map(lambda x: asin_map[x])\n",
    "reviews_df = reviews_df.sort_values(['reviewerID', 'unixReviewTime'])\n",
    "reviews_df = reviews_df.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "cate_list = [meta_df['categories'][i] for i in range(len(asin_map))]\n",
    "cate_list = np.array(cate_list, dtype=np.int32)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(root+'np_prepro/remap.pkl', 'wb') as f:\n",
    "    pickle.dump(reviews_df, f, pickle.HIGHEST_PROTOCOL) # uid, iid\n",
    "    pickle.dump(cate_list, f, pickle.HIGHEST_PROTOCOL) # cid of iid line\n",
    "    pickle.dump((user_count, item_count, cate_count, example_count),\n",
    "              f, pickle.HIGHEST_PROTOCOL)\n",
    "    pickle.dump((asin_key, cate_key, revi_key), f, pickle.HIGHEST_PROTOCOL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "random.seed(1234)\n",
    "\n",
    "train_set = []\n",
    "test_set = []\n",
    "for reviewerID, hist in reviews_df.groupby('reviewerID'):\n",
    "    pos_list = hist['asin'].tolist()\n",
    "    neg_list = []\n",
    "    for _ in range(len(pos_list)):\n",
    "        neg = pos_list[0]\n",
    "        while neg in pos_list + neg_list :\n",
    "            neg = random.randint(0, item_count-1)\n",
    "        neg_list.append(neg)\n",
    "        \n",
    "    for i in range(1, len(pos_list)-1):\n",
    "        hist = pos_list[:i]\n",
    "        train_set.append((reviewerID, hist, pos_list[i], 1))\n",
    "        train_set.append((reviewerID, hist, neg_list[i], 0))\n",
    "    label = (pos_list[-1], neg_list[-1])\n",
    "    test_set.append((reviewerID, hist, label))\n",
    "\n",
    "random.shuffle(train_set)\n",
    "random.shuffle(test_set)\n",
    "\n",
    "assert len(test_set) == user_count\n",
    "\n",
    "with open(root+'np_prepro/dataset.pkl', 'wb') as f:\n",
    "    pickle.dump(train_set, f, pickle.HIGHEST_PROTOCOL)\n",
    "    pickle.dump(test_set, f, pickle.HIGHEST_PROTOCOL)\n",
    "    pickle.dump(cate_list, f, pickle.HIGHEST_PROTOCOL)\n",
    "    pickle.dump((user_count, item_count, cate_count), f, pickle.HIGHEST_PROTOCOL)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
